# Regress witnesses-per-author on texts-per-author for the author traditions
# whose `colors` value matches `select`, optionally print the model summary,
# and draw the regression line (coloured by `select`) on the current plot.
#
# Relies on globals defined elsewhere in the document: `autTrads` (matrix or
# data frame with columns witPerAut / TextPerAut) and `colors` (a vector
# parallel to autTrads rows) -- TODO confirm their shapes against the callers.
#
# @param select colour/category used both to subset rows and as line colour.
# @param print  if TRUE, print the lm summary (default TRUE).
# @return invisibly NULL (called for its plotting side effect).
maReg = function(select = "", print = TRUE){
  reg = lm(witPerAut ~ TextPerAut,
           data = as.data.frame(autTrads[colors == select,]))
  # isTRUE() is safer than `print == TRUE`: it handles NA and non-logical
  # values without erroring. Note the parameter shadows base::print as a
  # value, but R still resolves print() in call position to the function.
  if(isTRUE(print)){
    print(summary(reg))
  }
  # abline() returns NULL; wrap in invisible() so nothing auto-prints.
  invisible(abline(reg, col = select))
}
Distributions of the number of witnesses and of texts per author, and how these quantities relate to each other.
Chronological distributions?
Geographical distributions?
# Remove vidas: keep only rows whose repertory number contains a comma,
# i.e. actual "author,text" entries.
BeDT = BeDT[grep(",", BeDT[,"repertorio_n"]), ]
# Non strictly lyric pieces (roman numerals in the repertory id): keep or not?
# NOTE(review): drop = TRUE on a multi-column row subset is a no-op for a
# data.frame with several columns, and would collapse to a list if exactly
# one row matched -- confirm drop = FALSE was not intended.
BeDT = BeDT[grep("[IVX]+", BeDT[,"repertorio_n"], invert = TRUE), , drop = TRUE]
# Add generation information
# get the frequencies of the texts (number of witnesses per repertory number)
textsFreqs = table(as.character(BeDT[,"repertorio_n"]))
#View(sort(textsFreqs, decreasing = TRUE))
textsFreqs = as.data.frame(textsFreqs)
# plot it
#plot(textsFreqs[,2])
#barplot(textsFreqs[,2])
# hist(textsFreqs[,2], breaks = seq(min(textsFreqs[,2])-0.5, max(textsFreqs[,2])+0.5, by=1), main = "Distribution of witnesses per text", xlab = "number of witnesses", include.lowest = TRUE)
# or, better: discrete frequency spikes of the witnesses-per-text counts
plot(table(textsFreqs[,2]), type = "h", col = "red", lwd = 10, main = "Distr. of witnesses per troubadour text", xlab = "number of witnesses", ylab = "Freqs", sub = paste("N = ", length(textsFreqs[,2])))
#boxplot(textsFreqs[,2])
#summary(textsFreqs[,2])
# geometric mean of the witnesses-per-text distribution
exp(mean(log(textsFreqs[,2])))
## [1] 2.715444
# frequencies for each numeric value (how many texts have k witnesses)
numFreqs = table(textsFreqs[,2])
numFreqs = as.data.frame(numFreqs)
# as.data.frame(table) yields factor columns; coerce both to numeric
numFreqs = sapply(numFreqs, as.numeric)
plot(numFreqs[,1], numFreqs[,2], log = "xy", main = "Distr. of wits per troubadour text - logarithmic scale", xlab = "number of witnesses", ylab = "freqs")
Petit florilège:
# Top texts by number of witnesses
head(textsFreqs[order(textsFreqs[,2], decreasing = TRUE), ])
## Var1 Freq
## 1747 BEdT 364,039 28
## 660 BEdT 155,001 26
## 675 BEdT 155,016 26
## 678 BEdT 155,021 25
## 673 BEdT 155,014 24
## 763 BEdT 167,059 24
# save the troubadour frequencies before textsFreqs is reused for other corpora
troubtextsFreqs = textsFreqs
# add generation info to autText
#autText = autText
# The "gen" column is bound twice on purpose: the first copy is recoded into
# period labels below, while the second ("indic") keeps the raw BeDT code.
autTextGen = cbind(autText, as.character(BeDT_auts[autText[, "aut"], ][, "gen"]), as.character(BeDT_auts[autText[, "aut"], ][, "gen"]))
colnames(autTextGen)[3:4] = c("gen", "indic")
# Recode generation codes (leading digit) into date-range labels.
# Order matters: once "^1" codes become "-1150" they no longer match "^2".."^6".
autTextGen[, "gen"][grep("^1", autTextGen[, "gen"])] = "-1150"
autTextGen[, "gen"][grep("^2", autTextGen[, "gen"])] = "1150-1175"
autTextGen[, "gen"][grep("^3", autTextGen[, "gen"])] = "1170-1210"
autTextGen[, "gen"][grep("^4", autTextGen[, "gen"])] = "1190-1235"
autTextGen[, "gen"][grep("^5", autTextGen[, "gen"])] = "1230-1265"
autTextGen[, "gen"][grep("^6", autTextGen[, "gen"])] = "1260-"
# Codes starting with 0, 9 or a mean the generation is unknown
autTextGen[, "gen"][grep("^(0|9|a)", autTextGen[, "gen"])] = "?"
plot(as.factor(autTextGen[, "gen"]), main="Nombre de textes par génération\nsource: BeDT", las = 2)
# Get sigla: extract the manuscript siglum from the SIGLA field by stripping
# the "p_°^..._" wrapper around the identifier.
sigla = as.factor(gsub("p\\_\\°?\\^?([A-Za-z0-9]+)\\_.*$", "\\1", BeDT[,"SIGLA"]))
# Normalize factor levels to stable, shelfmark-like names
levels(sigla)[grep("omega", levels(sigla))] = "omega"
levels(sigla)[grep("psi", levels(sigla))] = "psi"
levels(sigla)[grep("eta", levels(sigla))] = "eta"
levels(sigla)[grep("Bamb", levels(sigla))] = "bamberg136"
levels(sigla)[grep("BAV, PL", levels(sigla))] = "PalLat753"
levels(sigla)[grep("BAV, BL", levels(sigla))] = "BarbLat3953"
levels(sigla)[grep("Barc - 239", levels(sigla))] = "Barc239"
levels(sigla)[grep("Barc - 850", levels(sigla))] = "Barc850"
levels(sigla)[grep("Str.App.8", levels(sigla))] = "StrApp8"
levels(sigla)[grep("Nü - II.77", levels(sigla))] = "NurnbergII77"
levels(sigla)[grep("Mü", levels(sigla))] = "MunchenLat759"
# NOTE(review): the pattern "Harl - 4041" is mapped to "Harley3041"; the
# shelfmark numbers differ (4041 vs 3041) -- confirm which one is correct.
levels(sigla)[grep("Harl - 4041", levels(sigla))] = "Harley3041"
levels(sigla)[grep("MI - D.55_0001", levels(sigla))] = "MilanoD55sup"
#write.csv("sigla.csv", x = levels(sigla))
# Attach the cleaned sigla as a new column
BeDT = cbind(BeDT, sigla)
# vegan can also estimate series of Renyi and Tsallis diversities; Renyi
# diversity of order a (Hill, 1973) -> TODO: look into this index
library("vegan")
# Evolution through time
# First, build the generation x text abundance matrix.
# `uniqueGens` is assumed to be defined earlier in the document -- verify.
uniqueTexts = sort(unique(paste(autTextGen[,1], ',', autTextGen[,2], sep = '')))
TextsByGen = matrix(nrow = length(uniqueGens), ncol = length(uniqueTexts), dimnames = list(uniqueGens, uniqueTexts), data = 0)
for(i in 1:length(uniqueGens)){
# witness counts for each "author,text" of this generation
thisGenTexts = table(paste(autTextGen[autTextGen[, "gen"] == uniqueGens[i], ][,1], ',', autTextGen[autTextGen[, "gen"] == uniqueGens[i], ][,2] , sep=""))
for(j in 1:length(thisGenTexts)){
TextsByGen[uniqueGens[i], names(thisGenTexts[j])] = thisGenTexts[j]
}
}
# And now, global Shannon diversity per generation
TextGens_diversities = vegan::diversity(TextsByGen, index = "shannon")
TextGens_diversities
## -1150 1150-1175 1170-1210 1190-1235 1230-1265 1260-
## 4.041343 5.135258 6.019933 6.052840 5.538136 5.984030
# And distributions and means per period
plot(TextGens_diversities, type="b", sub = paste("Source = BedT -- N. wits = ", sum(TextsByGen)), xlab = "Generations", xaxt="n", ylab="Shannon div.", main = "Generations as sites, texts as species, witnesses as individuals")
axis(1, at=1:6, labels=uniqueGens)
Some stats on the lost works.
# Survival tallies (cons. = preserved, att. = attested in external sources):
# preserved and attested: q, S, Y, W, I, C, gamma, B, Z, E, f, delta, T, X, R, n, p
# preserved, not attested: Ch, G?, psi,
# lost: 4
data = matrix(c(
17,
3,
4,
0
),
ncol = 1,
# Row labels kept in French because they are displayed in the plots below
dimnames = list(c("cons. et att.", "cons., non att.?", "perdus et att.", "perdus non att. ?"), "freq")
)
barplot(data, beside = TRUE, names.arg = rownames(data), las = 2)
# Loss rate: lost-and-attested items as a percentage of all items
tauxPerte = round(data[3,] / sum(data) * 100)
library(ggplot2)
data = as.data.frame(data)
ggplot(data=data, aes(x=rownames(data), y=data[,1])) +
geom_bar(stat="identity") + xlab(paste("pertes > ", tauxPerte, "%")) + ylab( "Fréq.") + ggtitle("Chans. en Fr. XVI-XVIII")
Use mass instead of density, because the variable is discrete (or else account for the partiality of witnesses, e.g. a witness preserving half of the text, or 0.65 of it: for now, a fragment of 2 verses counts as a witness in the same way as a complete manuscript of 2000 verses — but there is no extant data for this).
Selon Wikipedia (en), la densité suit \(p(x) \sim L(x)x^{-(\alpha+1)}\) — soit \(P(X > x) \sim L(x)x^{-\alpha}\) pour la fonction de survie —, où \(\alpha > 0\) et \(L(x)\) est une fonction à variation lente, qui contrôle la forme finie de la queue. Si \(L(x)\) est une constante, alors on a vraiment une loi de puissance.
La plupart du temps, on doit fixer une valeur minimale \(x_{min}\) à partir de laquelle la loi vaut (mais pas avant).
# and now density
#plot(density(textsFreqs[,2]), xlim = c(1,30), xaxs = "i")
# or rather, probability mass (the variable is discrete)
plot(prop.table(table(textsFreqs[,2])), xlab = "n. witn.", ylab = "mass", main = "Troubadour Poems")
#hist(textsFreqs[,2], probability = TRUE)
# Try to fit a power law by eye (logically, bars rather than a line)
curve(0.4 * x^-1.3, add = TRUE, col = "red")
# the same thing on a log/log scale
plot(prop.table(table(textsFreqs[,2])), xlab = "n. witn.", ylab = "mass", xlim = c(1,30), ylim = c(0.01, 0.4), log = "xy", main = "Troubadour Poems", sub = "log/log plot")
#hist(textsFreqs[,2], probability = TRUE)
# Try to fit a power law by eye (logically, bars rather than a line)
curve(0.4 * x^-1.3, add = TRUE, col = "red")
Estimation with regression
data = cbind(as.numeric(labels(table(textsFreqs[,2]))[[1]]), table(textsFreqs[,2]))
Cela semble s’appliquer aussi aux témoins de chansons de geste,
# Mass plot of witness counts for chansons de geste (RepTrad loaded elsewhere)
plot(prop.table(RepTrad[,1]), xlab = "n. witn.", ylab = "mass", main = "Chansons de geste")
#hist(textsFreqs[,2], probability = TRUE)
# Try to fit a power law by eye (logically, bars rather than a line)
curve(0.4 * x^-1.3, add = TRUE, col = "red")
# the same thing on a log/log scale
plot(prop.table(RepTrad[,1]), xlab = "n. witn.", ylab = "mass", xlim = c(1,30), ylim = c(0.01, 0.4), log = "xy", main = "Chansons de geste", sub = "log/log plot")
## Warning in xy.coords(x, y, xlabel, ylabel, log): 12 y values <= 0 omitted from
## logarithmic plot
#hist(textsFreqs[,2], probability = TRUE)
# Try to fit a power law by eye (logically, bars rather than a line)
curve(0.4 * x^-1.3, add = TRUE, col = "red")
NB: Sauf qu’en fait, la constante au pif que j’ai utilisée devrait plutôt être une constante normalisatrice, telle que \(p(x) = \frac{\alpha - 1}{x_{min}} \left(\frac{x}{x_{min}}\right)^{-\alpha}\). En fait, je retombais vaguement sur cela empiriquement, vu que je supposais que \(x_{min}\) était 1 et que je donnais des valeurs voisines à la constante et à \(\alpha - 1\). Donc passer de 1.3 à 1.4.
poweRlaw
Maintenant, on peut aussi essayer de trouver un fit via un algorithme
dédié, par exemple via le module poweRlaw.
À voir: que fait le plot de poweRlaw exactement ? Il trace des CCDF
(fonctions de répartition complémentaires), qui renvoient à
l’équation 3.9 dans Clauset et al. (2009).
# Discrete power-law model for the troubadour witness counts (poweRlaw)
troub.pl = poweRlaw::displ$new(textsFreqs[,2])
troub.pl$setXmin(1)
# Maximum-likelihood estimate of the exponent, with xmin fixed at 1
(est = poweRlaw::estimate_pars(troub.pl))
## $pars
## [1] 1.680934
##
## $value
## [1] 6097.674
##
## $counts
## function gradient
## 7 7
##
## $convergence
## [1] 0
##
## $message
## [1] "CONVERGENCE: REL_REDUCTION_OF_F <= FACTR*EPSMCH"
##
## attr(,"class")
## [1] "estimate_pars"
# Same fit for the chansons de geste corpus
geste.pl = poweRlaw::displ$new(RepTradData[,1])
geste.pl$setXmin(1)
(est = poweRlaw::estimate_pars(geste.pl))
## $pars
## [1] 1.765307
##
## $value
## [1] 442.4034
##
## $counts
## function gradient
## 7 7
##
## $convergence
## [1] 0
##
## $message
## [1] "CONVERGENCE: REL_REDUCTION_OF_F <= FACTR*EPSMCH"
##
## attr(,"class")
## [1] "estimate_pars"
Ce qui voudrait dire \(\alpha\) à 1,69 pour les troubadours et 1,76 pour les chansons de geste, avec \(x_{min} = 1\).
Mais on peut aussi chercher à optimiser \(x_{min}\):
# Joint estimation of xmin and the exponent (Kolmogorov-Smirnov minimization)
(est1 = poweRlaw::estimate_xmin(troub.pl))
## $gof
## [1] 0.06748236
##
## $xmin
## [1] 23
##
## $pars
## [1] 11.04859
##
## $ntail
## [1] 11
##
## $distance
## [1] "ks"
##
## attr(,"class")
## [1] "estimate_xmin"
(est2 = poweRlaw::estimate_xmin(geste.pl))
## $gof
## [1] 0.07572512
##
## $xmin
## [1] 6
##
## $pars
## [1] 2.954922
##
## $ntail
## [1] 38
##
## $distance
## [1] "ks"
##
## attr(,"class")
## [1] "estimate_xmin"
Cela serait 23 pour les troubadours et 6 pour les chansons de geste. À partir de là, les valeurs d’\(\alpha\) seraient 11.0485886 et 2.9549223.
# Refit and plot using the optimized xmin / exponent
troub.pl$setXmin(est1$xmin)
troub.pl$setPars(est1$pars)
poweRlaw::plot(troub.pl)
poweRlaw::lines(troub.pl, col = 2)
# not great; so instead go back to xmin = 1 and the earlier MLE exponent
troub.pl$setXmin(1)
troub.pl$setPars(1.68)
poweRlaw::plot(troub.pl)
poweRlaw::lines(troub.pl, col = 2)
Pour être sûr qu’on soit bien en présence d’une loi de puissance, on
peut tester la procédure fondée sur Clauset et al.
library("poweRlaw") # Needed to avoid a bug. Namespace badly declared in the package?
bs_p = poweRlaw::bootstrap_p(troub.pl, no_of_sims=100, threads=2)# sims lowered from 1000 to 100 to limit execution time in this sheet.
## Expected total run time for 100 sims, using 2 threads is 7.44 seconds.
# here p would equal 0, which would mean we do not have a power law at all
troub.pl$setXmin(est1$xmin)
troub.pl$setPars(est1$pars)
bs_p2 = poweRlaw::bootstrap_p(troub.pl, no_of_sims=100, threads=2)
## Expected total run time for 100 sims, using 2 threads is 6.18 seconds.
# still 0, even after changing the pars
# Home-made rank/frequency (Zipf-style) plots, linear then log/log
plot(x = rank(-textsFreqs[,2], ties.method = "random"), y = textsFreqs[,2])
plot(x = rank(-textsFreqs[,2], ties.method = "random"), y = textsFreqs[,2], log="xy")
# zipfR
Voir zipfR.
Fitting a distribution.
Pour une liste des noms de distributions standards dans R, cf. la doc Distributions {stats}
# Power-law fit via igraph's plain MLE implementation (xmin not optimized)
fit_pl = igraph::fit_power_law(textsFreqs[,2], implementation = "R.mle")
fit_pl
##
## Call:
## stats4::mle(minuslogl = mlogl, start = list(alpha = start))
##
## Coefficients:
## alpha
## 1.674228
stats4::logLik(fit_pl)
## 'log Lik.' -6094.22 (df=1)
# Same, with the Clauset-Shalizi-Newman "plfit" implementation (optimizes xmin)
fit_pl = igraph::fit_power_law(textsFreqs[,2], implementation = "plfit")
fit_pl
## $continuous
## [1] FALSE
##
## $alpha
## [1] 15.73528
##
## $xmin
## [1] 22
##
## $logLik
## [1] -35.22062
##
## $KS.stat
## [1] 0.05434208
## Le chargement a nécessité le package : MASS
## Le chargement a nécessité le package : survival
# NOTE(review): plotdist/descdist come from fitdistrplus, presumably loaded
# elsewhere in the document (MASS/survival above are its dependencies) -- verify.
plotdist(textsFreqs[,2], histo = TRUE, demp = TRUE, discrete = TRUE)
# Cullen-Frey descriptive graph, 500 bootstrap samples
descdist(textsFreqs[,2], discrete=TRUE, boot = 500)
## summary statistics
## ------
## min: 1 max: 28
## median: 2
## mean: 4.614592
## estimated sd: 5.118794
## estimated skewness: 1.640177
## estimated kurtosis: 4.977446
#fit_exp = fitdist(textsFreqs[,2], "exp", discrete = TRUE)
#plot(fit_exp)
#dmyPlaw = function(a,x,k) a * x^-k
#fit_pl = fitdist(textsFreqs[,2], "myPlaw", start = list(a = 1, k = 1))
# Candidate discrete distributions: Poisson, binomial, negative binomial,
# geometric, hypergeometric
fit_p = fitdist(textsFreqs[,2], "pois") # -> not it.
fit_p
## Fitting of the distribution ' pois ' by maximum likelihood
## Parameters:
## estimate Std. Error
## lambda 4.614592 0.0427754
# So lambda would be 4.44 and its standard deviation sqrt(3.67)
# NOTE(review): the 4.44 / 3.67 figures disagree with the 4.61 estimate
# printed above -- possibly left over from an earlier data state; confirm.
plot(fit_p)
#fit_b = fitdist(wl, "binom", lower = c(0, 0))
fit_nb = fitdist(textsFreqs[,2], "nbinom") # -> still not it.
plot(fit_nb)
fit_g = fitdist(textsFreqs[,2], "geom")# -> nope
plot(fit_g)
# And the distributions requiring fixed parameters
#prefit(textsFreqs[,2], "hyper")
#fit_hg = fitdist(textsFreqs[,2], "hyper")
# Compare the fitted densities on one plot
denscomp(list(fit_p, fit_nb, fit_g))
#cdfcomp (list(fit_w, fit_g, fit_ln), legendtext = plot.legend)
# Treating it as a continuous variable
plotdist(textsFreqs[,2], histo = TRUE, demp = TRUE, discrete = FALSE)
descdist(textsFreqs[,2], discrete=FALSE, boot = 500)
## summary statistics
## ------
## min: 1 max: 28
## median: 2
## mean: 4.614592
## estimated sd: 5.118794
## estimated skewness: 1.640177
## estimated kurtosis: 4.977446
# Log/log regression of frequency on witness count: the slope estimates the
# power-law exponent (here about -1.78).
plot(log(numFreqs[,1]), log(numFreqs[,2]), main = "Distr. of wits per troubadour text - log / log plot", xlab = "log(number of witnesses)", ylab = "log(freqs)")
colnames(numFreqs)[1] = "NbWits"
reg = lm(log(Freq) ~ log(NbWits), data = as.data.frame(numFreqs))
abline(reg, col="red")
summary(reg)
##
## Call:
## lm(formula = log(Freq) ~ log(NbWits), data = as.data.frame(numFreqs))
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.8901 -0.3787 0.1810 0.5814 0.9218
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.6040 0.4662 16.309 7.81e-15 ***
## log(NbWits) -1.7751 0.1843 -9.631 6.82e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7907 on 25 degrees of freedom
## Multiple R-squared: 0.7877, Adjusted R-squared: 0.7792
## F-statistic: 92.75 on 1 and 25 DF, p-value: 6.817e-10
# Annotate the current plot with the adjusted R-squared
mtext(text = paste("Adj. R² ", round(summary(reg)$adj.r.squared, digits = 3 )), line = 4, side= 1, cex=1)
## Distribution of the manuscripts by century
barplot(RepSiecle[,1], names.arg = rownames(RepSiecle), main = "Epic mss per century", sub="From Duggan (1982), Careri (2006) and Careri et al. (2011)")
barplot(RepTranche[,3], names.arg = rownames(RepTranche), main = "Epic mss per half-century", sub="From Duggan (1982), Careri (2006) and Careri et al. (2011)")
barplot(RepTrad[,1], names.arg = rownames(RepTrad), main = "Distr. of wits per epic text", sub = "from Vitale-Brovarone (2006)", ylab = "Freqs", xlab = "nb. witnesses")
plot(RepTrad[,1], type = "h", col = "red", lwd = 10, main = "Distr. of wits per epic text \n Data: Vitale-Brovarone (2006)", xlab = "number of witnesses", ylab = "Freqs", sub = paste("N = ", sum(RepTrad[,1])), xlim = c(0.1,30), ylim = c(1,80), xaxs = "i", yaxs = "i")
#mtext(text = "nb. de témoins", line = 2, side= 1, cex=1.2)
plot(density(RepTrad[,1]), main = "Kernel Density Estimation", sub = "Distribution of witnesses per epic text")
# Expand the frequency column into one entry per witness: the text at row i,
# having RepTrad[i,1] witnesses, contributes RepTrad[i,1] copies of the value i.
# A single vectorized rep() replaces the original grow-with-c() loop (which
# copies the whole vector on every iteration).
w = rep(seq_along(RepTrad[,1]), times = RepTrad[,1])
summary(w)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 2.000 3.498 4.000 29.000
plot(prop.table(RepTrad[,1]), xlab = "n. witn.", ylab = "mass", main = "Distr. of witness per epic text", type = "h")
#hist(textsFreqs[,2], probability = TRUE)
# Try to fit a power law by eye (logically, bars rather than a line)
curve(0.4 * x^-1.3, add = TRUE, col = "red")
# the same thing on a log/log scale
plot(prop.table(RepTrad[,1]), xlab = "n. witn.", ylab = "mass", xlim = c(1,30), ylim = c(0.01, 0.4), log = "xy")
## Warning in xy.coords(x, y, xlabel, ylabel, log): 12 y values <= 0 omitted from
## logarithmic plot
#hist(textsFreqs[,2], probability = TRUE)
# Try to fit a power law by eye (logically, bars rather than a line)
curve(0.4 * x^-1.3, add = TRUE, col = "red")
### Description of the distribution
table(RepTradData[,1])
##
## 1 2 3 4 5 6 7 8 9 10 11 13 14 15 17 22 29
## 76 44 28 11 8 8 10 1 3 4 2 3 1 1 3 1 1
#plot(RepTradData[,1], ylab = "Nb. de témoins", main = "Distrib. des chansons selon leur nb. de témoins", sub="d'après Vitale-Brovarone (2006)")
# Log/log plots
#plot(log(RepTradData[,1]), ylab = "Nombre de mss", main = "Distrib. des chansons selon leur nb. de mss", sub="d'après Vitale-Brovarone (2006)")
# Would a barplot be better?
# Drop zero-frequency rows before the log-scale plot below
RepTrad2 = RepTrad2[RepTrad2[,1] > 0 , ]
plot(RepTrad2[,2], RepTrad2[,1], main="Fréquence en chansons pour le nb. de témoins", ylab="Fréq. des chansons", xlab="Nb. de témoins")
plot(RepTrad2[,2], RepTrad2[,1], log="xy", main = "Distr. of wits per epic text - logarithmic scale", xlab = "number of witnesses", ylab = "freqs")
#hist(RepTradData[,1], breaks = 1:30)
#hist(log(RepTradData[,1]))
#plot(RepTrad2, log="xy", type='h')
#boxplot(RepTradData)
summary(RepTradData)
## Nb.de.témoins
## Min. : 1.000
## 1st Qu.: 1.000
## Median : 2.000
## Mean : 3.498
## 3rd Qu.: 4.000
## Max. :29.000
# Geometric mean
exp(mean(log(RepTradData[,1])))
## [1] 2.33514
Status of the tradition for major versions,
# Load the epic works catalogue (one row per major version)
epicWorks = read.csv("data/geste_works.csv")
# Quick reordering of factor levels so the plots show kept / fragm / lost in order
epicWorks[, "Statut.trad."] = factor(epicWorks[, "Statut.trad."], levels = c("kept", "fragm", "lost"))
epicWorks[, "Trad.hypoth"] = factor(epicWorks[, "Trad.hypoth"], levels = c("hypoth", "attested"))
ggplot(data=epicWorks, aes(Statut.trad.)) + geom_bar(aes(fill=Trad.hypoth), color = "black") + ggtitle("Gestes: Major versions") +
xlab("Status of the tradition") + theme(axis.text.x = element_text(size = rel(1.2), face = "bold")) + scale_fill_manual(values=c("darkgray", "darkred"))
and for works,
# Collapse version-level records into one tradition status per work ("Meta"):
# a work is "kept" if any of its versions is kept, otherwise "fragm" if any
# version is fragmentary, otherwise "lost"; a lost work with no attested
# version at all is additionally flagged as hypothetical.
meta = unique(epicWorks[, "Meta"])
worksStatus = matrix(nrow = length(meta), ncol = 3, dimnames = list(NULL, c("Meta", "StatusTrad", "Trad.hypoth")))
worksStatus[, "Trad.hypoth"] = "attested"
for (i in seq_along(meta)) {
  # all version rows belonging to this work
  versions = epicWorks[epicWorks[, "Meta"] == meta[i], ]
  worksStatus[i, "Meta"] = as.character(meta[i])
  if ("kept" %in% versions[, "Statut.trad."]) {
    worksStatus[i, "StatusTrad"] = "kept"
  } else if ("fragm" %in% versions[, "Statut.trad."]) {
    worksStatus[i, "StatusTrad"] = "fragm"
  } else if ("lost" %in% versions[, "Statut.trad."]) {
    worksStatus[i, "StatusTrad"] = "lost"
    if (!"attested" %in% versions[, "Trad.hypoth"]) {
      worksStatus[i, "Trad.hypoth"] = "hypoth"
    }
  }
}
worksStatus = as.data.frame(worksStatus)
# Quick reordering of factor levels for plotting
worksStatus[, "StatusTrad"] = factor(worksStatus[, "StatusTrad"], levels = c("kept", "fragm", "lost"))
worksStatus[, "Trad.hypoth"] = factor(worksStatus[, "Trad.hypoth"], levels = c("hypoth", "attested"))
ggplot(data=worksStatus, aes(StatusTrad)) + geom_bar(aes(fill=Trad.hypoth), color = "black") + ggtitle("Gestes: Works") +
xlab("Status of the tradition") + theme(axis.text.x = element_text(size = rel(1.2), face = "bold")) + scale_fill_manual(values=c("darkgray", "darkred"))
# Major versions broken down by (standardized) date
ggplot(data=epicWorks, aes(StandDate)) + geom_bar(aes(fill=Statut.trad.), color = "black") + ggtitle("Gestes: Major versions by century") +
xlab("Status of the tradition") + theme(axis.text.x = element_text(size = rel(1.2), face = "bold"))
# Epic manuscripts in British medieval catalogues: identified vs unidentified
catalogues = matrix(data = c(8, 33, 4), ncol = 1, dimnames = list(c("Ms. identif.", "Ms. unidentif.", "Text (& ms.) unindentif."), "counts"))
barplot(t(catalogues), main = "Epic MSS in British Med. Catalogues", sub = "No identif. (lost?) for c. 75%")
# Alternative version with GGPlot, faking actual data (one row per manuscript)
catalogues = matrix(nrow = (8+33+4), ncol = 2,
data = c(rep("identified", 8), rep("unidentif.", 37),
rep("known", (8+33)), rep("unknown", 4)),
dimnames = list(NULL, c("MS.", "Text"))
)
catalogues = as.data.frame(catalogues)
ggplot(data=catalogues, aes(MS.)) + geom_bar(aes(fill=Text), color = "black") + ggtitle("Epic MSS in British Med. Catalogues") + xlab("No identif. (lost?) for c. 75%") + theme(axis.text.x = element_text(size = rel(1.2), face = "bold"))
Si l’on suppose un taux de décimation d’un manuscrit conservé sur 1000 copiés
# Assumed decimation rate: only 1 manuscript in `Decimation` copies survives.
Decimation=1000
RepTradSuppose = RepTrad2
# Scale the witness counts in one vectorized step (the original multiplied
# row by row in a loop, which is equivalent but slower and noisier).
RepTradSuppose[, 2] = RepTradSuppose[, 2] * Decimation
# Log/log plot of the hypothetical (pre-decimation) population
plot(log(RepTradSuppose[,2]), log(RepTradSuppose[,1]), main=paste('Fréquence en chansons pour le nb. de témoins \n Décimation supposée:', Decimation-1, 'sur', Decimation), ylab="log(Fréq. des chansons)", xlab="log(Nb. de témoins)", xlim = c(0,10), ylim=c(0,20))
reg = lm(log(Fréquence..chansons.) ~ log(Nb..de.témoins), data=RepTradSuppose)
abline(reg, untf = TRUE, col="red")
mtext(paste("R² ajusté", round(summary(reg)$adj.r.squared, digits = 4)), side = 1, line=4)
# Regression summary
summary(reg)
##
## Call:
## lm(formula = log(Fréquence..chansons.) ~ log(Nb..de.témoins),
## data = RepTradSuppose)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.45519 -0.18416 0.02561 0.35412 0.74680
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.6108 1.4251 10.252 3.60e-08 ***
## log(Nb..de.témoins) -1.4638 0.1585 -9.234 1.41e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.557 on 15 degrees of freedom
## Multiple R-squared: 0.8504, Adjusted R-squared: 0.8404
## F-statistic: 85.27 on 1 and 15 DF, p-value: 1.411e-07
Situation un peu différente ici, car la base n’enregistre que (une sélection d’) œuvres latines à succès.
# plot it (mediolatin "Fama" corpus; textsFreqs is recomputed upstream of here)
plot(textsFreqs[,2])
#barplot(textsFreqs[,2])
# hist(textsFreqs[,2], breaks = seq(min(textsFreqs[,2])-0.5, max(textsFreqs[,2])+0.5, by=1), main = "Distribution of witnesses per text", xlab = "number of witnesses", include.lowest = TRUE)
# or, better
plot(table(textsFreqs[,2]), type = "h", col = "red", lwd = 10, main = "Distr. of witnesses per mediolatin successfull text", xlab = "number of witnesses", ylab = "Freqs", sub = paste("N = ", length(textsFreqs[,2])))
#boxplot(textsFreqs[,2])
#summary(textsFreqs[,2])
# geometric mean
exp(mean(log(textsFreqs[,2])))
## [1] 62.48062
# frequencies for each numeric value (how many texts have k witnesses)
numFreqs = table(textsFreqs[,2])
numFreqs = as.data.frame(numFreqs)
# as.data.frame(table) yields factor columns; coerce both to numeric
numFreqs = sapply(numFreqs, as.numeric)
# Title fixed: the original said "troubadour" (copy-paste slip) although this
# section deals with the mediolatin (Fama) corpus.
plot(numFreqs[,1], numFreqs[,2], log = "xy", main = "Distr. of wits per mediolatin text - logarithmic scale", xlab = "number of witnesses", ylab = "freqs")
# or rather, probability mass
plot(prop.table(table(textsFreqs[,2])), xlab = "n. witn.", ylab = "mass", main = "Fama")
# Try to fit a power law by eye (logically, bars rather than a line)
curve(0.4 * x^-1.4, add = TRUE, col = "red")
# the same thing on a log/log scale
plot(prop.table(table(textsFreqs[,2])), xlab = "n. witn.", ylab = "mass", xlim = c(1,4000), ylim = c(0.01, 0.06), log = "xy", main = "Fama", sub = "log/log plot")
#hist(textsFreqs[,2], probability = TRUE)
# Try to fit a power law by eye (logically, bars rather than a line)
curve(0.4 * x^-1.4, add = TRUE, col = "red")
Number of copies per work
# Number of extant copies per work (ISTC incunabula data)
edFreqs = table(ISTC_wits[,"text"])
booksPerWorks = as.data.frame(edFreqs)
#View(edFreqs[with(edFreqs, order(-edFreqs[,2])), ])
plot(table(booksPerWorks[,2]), type = "h", col = "red", lwd = 10, xlab = "n. copies", main = "Incunabula printed in Italy \n Extant copies per work", ylab = "Freqs", sub = paste("Source: ISTC -- N = ", sum(booksPerWorks[,2])))
What are the texts with most books ?
#View(edFreqs)
Copies per edition
# Extant copies per edition (one ISTC id per edition)
edFreqs = table(ISTC_books[,"X_id"])
edFreqs = as.data.frame(edFreqs)
#edFreqs[with(edFreqs, order(-edFreqs[,2])), ]
plot(table(edFreqs[,2]), type = "h", col = "red", lwd = 10, xlab = "n. copies", main = "Incunabula printed in Italy \n Extant copies per edition", ylab = "Freqs", sub = paste("Source: ISTC -- N = ", length(edFreqs[,2])))
plot(prop.table(table(edFreqs[,2])), xlab = "n. copies", ylab = "mass", main = "Incunabula printed in Italy \n Extant copies per edition")
#hist(textsFreqs[,2], probability = TRUE)
# Try to fit a power law by eye (logically, bars rather than a line)
#curve(0.4 * x^-1.3, add = TRUE, col = "red")
Number of editions per work
# Editions (with extant copies) per work: count unique (edition, text) pairs
EdWorks = ISTC_wits[, c("X_id", "text")]
EdWorks = unique(EdWorks)
EdWorks = as.data.frame(table(EdWorks[,2]))
#View(EdWorks[with(EdWorks, order(-EdWorks[,2])), ])
plot(table(EdWorks[,2]), type = "h", col = "red", lwd = 10, xlab = "n. editions", main = "Incunabula printed in Italy \n Editions (with extant copies) per work", ylab = "Freqs", sub = paste("Source: ISTC -- N = ", length(EdWorks[,2])))
# Merged corpora list: frequency of each TRI identifier
data = read.csv("data/Corpora_Merged_list_with_corrs_2_revu-Guidi.csv")
d = as.data.frame(table(data$TRI))
library(ggplot2)
# Bar chart of the frequency counts, with a log10 y axis
ggplot(data=d, aes(x=Freq)) + geom_bar(stat = "count") + scale_y_continuous(trans='log10')
# Minnesänger traditions: witness counts scraped from
# "Lyrik des Deutschen Mittelalters".
minne = read.csv(file = "data/minnesang/traditions_scraped.csv", header = TRUE, row.names=1,stringsAsFactors = FALSE)
#View(minne)
# Witness counts per text -- presumably the 4th column of the scraped table
counts = minne[,4]
barplot(table(counts), main = "Distr. of wits per Minnesänger text", sub = "from Lyrik des Deutschen Mittelalters", ylab = "Freqs", xlab = "nb. witnesses")
# Fixed: stray double comma after table(counts) and duplicated "per per" in the title
plot(table(counts), type = "h", col = "red", lwd = 10, main = "Distr. of wits per Minnesänger text \n Data: Lyrik des Deutschen Mittelalters", xlab = "number of witnesses", ylab = "Freqs", sub = paste("N = ", sum(counts)))
#mtext(text = "nb. de témoins", line = 2, side= 1, cex=1.2)
# Fixed: the original re-plotted the frequency table under a "Kernel Density
# Estimation" title; use density() as in the parallel epic-text section, and
# correct the subtitle corpus ("epic" -> "Minnesänger").
plot(density(counts), main = "Kernel Density Estimation", sub = "Distribution of witnesses per Minnesänger text")